import plotly.graph_objects as go, plotly.express as px
import scanpy as sc, pandas as pd, numpy as np
def get_allvals(x):
    """Return every unique value of *x*, sorted, as one '<br>'-joined HTML string.

    Args:
        x (Iterable): Column values (typically a pandas Series) to summarize.

    Returns:
        str: String beginning with '<br>', then each unique value once in
            sorted order, separated by '<br>' (for plotly hover text).
    """
    allvals = sorted(set(x))
    # str() guards against non-string columns (ints, categoricals, ...):
    # str.join would otherwise raise TypeError on non-str elements.
    return '<br>' + '<br>'.join(str(v) for v in allvals)
def create_summary_obs(adata, max_unique=50, verbose=False):
    """Summarize the `obs` dataframe of an AnnData object.

    Produces one row per obs column: its number of unique values and, for
    columns with fewer than `max_unique` distinct values, an HTML string
    listing every sorted unique value (for interactive hover display).

    Args:
        adata (AnnData): Object whose `obs` dataframe is summarized.
        max_unique (int, optional): Only columns with fewer than this many
            unique values get a full value listing. Defaults to 50.
        verbose (bool, optional): Print progress messages. Defaults to False.

    Returns:
        pd.DataFrame: Columns ['colname', 'nunique', 'unique']; 'unique' is
            NaN for columns at or above the threshold.
    """
    if verbose:
        print(f'Getting unique values per columns in the obs-dataframe.')
    counts_df = adata.obs.nunique().to_frame().reset_index()
    counts_df.columns = ['colname', 'nunique']

    if verbose:
        print(f'Filtering out columns that have more than {max_unique} values.')
    detail_cols = counts_df.loc[counts_df['nunique'] < max_unique, 'colname'].tolist()

    if verbose:
        print(f'Pulling all values for columns that have less than {max_unique} values.')
    # apply() over the column subset yields a Series keyed by column name.
    values_series = adata.obs[detail_cols].apply(get_allvals)
    values_df = pd.DataFrame(values_series.T).reset_index()
    values_df.columns = ['colname', 'unique']

    if verbose:
        print(f'Creating the final merged dataframe containing all unique sorted values along with counts.')
    # Left merge keeps high-cardinality columns with 'unique' left as NaN.
    return counts_df.merge(values_df, how='left', on='colname')
def plot_obs_barchart(adata, max_unique=50, dataset_name='Dummy', verbose=False):
    """Build an interactive bar chart of unique-value counts per obs column.

    One bar per metadata column in `adata.obs`; hover text lists all unique
    values for columns below the `max_unique` threshold. A dropdown toggles
    the y-axis between linear and log scale.

    Args:
        adata (AnnData): Input AnnData object whose `obs` metadata is plotted.
        max_unique (int, optional): Threshold below which all values appear
            in the hover text. Defaults to 50.
        dataset_name (str, optional): Dataset name used in the plot title.
            Defaults to 'Dummy'.
        verbose (bool, optional): Also call `fig.show()` immediately.
            Defaults to False.

    Returns:
        plotly.Figure: Figure that can be displayed later via `fig.show()`.
    """
    summary_df = create_summary_obs(adata, max_unique=max_unique, verbose=verbose)

    bar_trace = go.Bar(
        x=summary_df['colname'],
        y=summary_df['nunique'],
        name='Counts for all metadata within AnnData.Obs',
        marker_color='rgb(39, 43, 102)',
        marker_line_color='rgb(8,48,107)',
        # Columns above the threshold have no value listing (NaN) — show a
        # placeholder instead.
        hovertext=summary_df['unique'].fillna('## Too many values ##'),
        hovertemplate='Column: %{x}<br>Unique-%{y}<br>%{hovertext}',
        marker_line_width=.8,
        opacity=0.6,
    )
    fig = go.Figure([bar_trace])

    # Dropdown buttons to switch the y-axis scale in place.
    scale_buttons = [
        dict(args=[{'yaxis.type': 'linear'}], label='Linear scale', method='relayout'),
        dict(args=[{'yaxis.type': 'log'}], label='Log Scale', method='relayout'),
    ]
    fig.update_layout(
        yaxis={'title': 'Unique Counts'},
        xaxis={'title': 'CellType label', 'categoryorder': 'category ascending'},
        title=f'Summarized metadata from the "{dataset_name}" dataset',
        width=800,
        height=800,
        updatemenus=[
            dict(buttons=scale_buttons, direction='down', showactive=True, x=1., y=1.1)
        ],
    )
    fig.update_yaxes(tick0=0, dtick=1)

    if verbose:
        fig.show()
    return fig
def plot_obs_treemap(adata, max_unique=50, dataset_name='Dummy', verbose=False):
    """Build an interactive treemap of unique-value counts per obs column.

    Each obs column becomes one rectangle whose area is proportional to its
    number of unique values; hovering shows the count and (for columns below
    the `max_unique` threshold) the full value listing.

    Args:
        adata (AnnData): Input AnnData object whose `obs` metadata is plotted.
        max_unique (int, optional): Threshold below which all values appear
            in the hover text. Defaults to 50.
        dataset_name (str, optional): Dataset name used in the plot title.
            Defaults to 'Dummy'.
        verbose (bool, optional): Also call `fig.show()` immediately.
            Defaults to False.

    Returns:
        plotly.Figure: Figure that can be displayed later via `fig.show()`.
    """
    summary_obs_df = create_summary_obs(adata, max_unique=max_unique, verbose=verbose)
    # NOTE(review): a 'nunique_log' column (np.log of the counts) was computed
    # here but never used by the treemap; removed as dead code.
    fig = px.treemap(
        summary_obs_df,
        path=['colname'],        # one rectangle per obs column
        values='nunique',        # area ~ number of unique values
        color='colname',
        hover_data={'nunique': True, 'unique': True},
        hover_name='colname'
    )
    fig.update_layout(
        margin=dict(t=50, l=25, r=25, b=25),
        # Kept from the original for layout parity; treemaps have no cartesian
        # axes, so the xaxis/yaxis settings are effectively inert.
        yaxis={'title': 'Unique Counts'},
        xaxis={'title': 'CellType label', 'categoryorder': 'category ascending'},
        title=f'Summarized metadata from the "{dataset_name}" dataset',
        width=800,
        height=800
    )
    fig.update_traces(marker={'colorscale': 'viridis'})
    if verbose:
        fig.show()
    return fig
# --- Load the query dataset and visualize its obs metadata ---
ANNDATA_FOLDER = 'Datasets'
QUERY_DATASET_NAME = 'LCA'
# Expected layout: Datasets/<name>/<name>.h5ad
ABS_FILE_PATH = f'{ANNDATA_FOLDER}/{QUERY_DATASET_NAME}/{QUERY_DATASET_NAME}.h5ad'
# "LCA.h5ad" file contains existing annotations using a human-in-the-loop strategy by the authors
query_adata = sc.read_h5ad(ABS_FILE_PATH)
# Bar chart of unique-value counts per obs column (hover shows value lists).
fig = plot_obs_barchart(query_adata, max_unique=50, dataset_name=QUERY_DATASET_NAME)
fig.show()
# Treemap view of the same summary.
fig = plot_obs_treemap(query_adata, max_unique=50, dataset_name=QUERY_DATASET_NAME)
fig.show()
# NOTE: captured notebook output — plotly.express (plotly\express\_core.py:1637) emits a pandas
# FutureWarning: "The frame.append method is deprecated ... Use pandas.concat instead."
import os
from subprocess import Popen, PIPE
def run_external_script(script_command, script_name, args):
    """Run an external script (e.g. Rscript or python) as a subprocess and print its output.

    Spawns the process via subprocess.Popen, waits for it to finish, and
    echoes the child's stdout; on a non-zero exit code its stderr is echoed
    too. Any exception while launching/communicating is caught and reported
    (best-effort — the caller is not interrupted).

    Args:
        script_command (str): Interpreter to invoke, e.g. 'Rscript' or 'python'.
        script_name (str): Path of the script to run.
        args (str | list[str]): Arguments for the script. A list is passed as
            separate argv entries; a single string is passed as ONE argv entry
            (original behavior — the child script must split it itself).
    """
    # Compute the display name up front so success, failure, and exception
    # paths all report the command consistently.
    display_name = script_command.capitalize()
    try:
        if isinstance(args, (list, tuple)):
            cmd = [script_command, script_name, *args]
        else:
            cmd = [script_command, script_name, args]
        pipe = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        output, error = pipe.communicate()
        # stdout is always echoed; stderr only when the script failed.
        print(f'{display_name} OUTPUT:\n')
        print(output.decode())
        if pipe.returncode != 0:
            print(f'{display_name} ERROR:\n')
            print(error.decode())
    except Exception as e:
        # Best-effort reporting: surface the failure without crashing the caller.
        print(f'\nSomething went wrong in the {display_name}-Script {script_name}.')
        print(e)
# --- Invoke the CellTypist prediction pipeline as an external script ---
PYTHON_CMD = 'python'
SCRIPT_NAME = 'CellTypist/celltypist_prediction_pipeline.py'
OUTPUT_PREDICTIONS_FILE = 'celltypist_preds.csv'
# Passed as a string flag value (not a bool) — the child script parses it.
MOUNT_GOOGLE_DRIVE = 'False'
CELLTYPIST_MODEL_NAME = 'Human_Lung_Atlas.pkl'
EXISTING_ANNOTATIONS_COLUMN = 'cell_ontology_type'
# NOTE(review): the whole flag string below is passed to the child as a single
# argv entry by run_external_script — confirm the child script expects that.
args = f'--mount-google-drive="{MOUNT_GOOGLE_DRIVE}" \
--existing-annotations-column="{EXISTING_ANNOTATIONS_COLUMN}" \
--folder-name="{ANNDATA_FOLDER}" \
--dataset-name="{QUERY_DATASET_NAME}" \
--output-predictions-file="{OUTPUT_PREDICTIONS_FILE}" \
--model-name="{CELLTYPIST_MODEL_NAME}"'
print(PYTHON_CMD, SCRIPT_NAME, args)
run_external_script(PYTHON_CMD, SCRIPT_NAME, args)
# --- Invoke the Azimuth prediction pipeline (R) as an external script ---
# RSCRIPT_CMD = '/N/soft/rhel7/r/4.1.1/lib64/R/bin/Rscript'
# NOTE(review): the R launcher is conventionally spelled 'Rscript' — verify
# 'RScript' resolves on this platform (case matters on Linux).
RSCRIPT_CMD = 'RScript'
SCRIPT_NAME = 'Azimuth/azimuth_prediction_pipeline.R'
OUTPUT_PREDICTIONS_FILE = "azimuth_preds.tsv"
'''
Choose one of Azimuth's references:
"adiposeref", "bonemarrowref", "fetusref", "heartref",
"humancortexref", "kidneyref", "lungref", "mousecortexref",
"pancreasref", "pbmcref", "tonsilref"
'''
REFERENCE = "lungref"
# Positional args for the R script: folder, dataset, output file, reference.
args = f'{ANNDATA_FOLDER} {QUERY_DATASET_NAME} {OUTPUT_PREDICTIONS_FILE} {REFERENCE}'
# /N/soft/rhel7/r/4.1.1/lib64/R/bin/Rscript Azimuth/azimuth_prediction_pipeline.R "Datasets" "LCA" "azimuth_preds_new.csv"
print(RSCRIPT_CMD, SCRIPT_NAME, args)
run_external_script(RSCRIPT_CMD, SCRIPT_NAME, args)
# Modularizing the PopV tutorial is tricky; keeping it aside for now.
# For Lung datasets, run the PopV tutorial ipynb file for now.